Carico le librerie
# Import libraries
# Install any missing dependencies, then attach the packages used directly.
# e1071, glmnet and randomForest only need to be installed (they are used
# at model-fitting time, randomForest via ::), not attached.
pkgs <- c("DBI", "RPostgres", "dplyr", "plotly", "caret",
          "e1071", "glmnet", "randomForest")
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)

# library() errors on failure, unlike require() which silently returns FALSE
library(DBI)
library(RPostgres)
library(dplyr)
library(plotly)
library(caret)
Connessione al db
# Connecting to db
# Open a PostgreSQL connection over SSL.
# NOTE(review): credentials are hard-coded; prefer Sys.getenv() or a
# config file kept out of version control.
con <- dbConnect(
  Postgres(),
  user = 'my_user',
  password = '&my_passw',
  dbname = 'my_db',
  host = 'my_host',
  port = 25060,
  sslmode = 'require'
)
Importo i dati in R
# dbGetQuery() wraps dbSendQuery() + dbFetch() + dbClearResult() in one
# call, so no result handle can be left open.
regressors <- dbGetQuery(con, "SELECT * FROM ccpp.regressors")
target <- dbGetQuery(con, "SELECT * FROM ccpp.target")

# Closing connection
dbDisconnect(con)
# Preview the first rows of the regressors table
head(regressors)
| id | at | v | ap | rh |
|---|---|---|---|---|
| 0 | 8.34 | 40.77 | 1010.84 | 90.01 |
| 1 | 23.64 | 58.49 | 1011.40 | 74.20 |
| 2 | 29.74 | 56.90 | 1007.15 | 41.91 |
| 3 | 19.07 | 49.69 | 1007.22 | 76.79 |
| 4 | 11.80 | 40.66 | 1017.13 | 97.20 |
| 5 | 13.97 | 39.16 | 1016.05 | 84.60 |
# Preview the first rows of the target table (note the NA in pe)
head(target)
| id | pe |
|---|---|
| 0 | 480.48 |
| 1 | NA |
| 2 | 438.76 |
| 3 | 453.09 |
| 4 | 464.43 |
| 5 | 470.96 |
# Both tables have the same number of rows, so a 1:1 join on id is possible
dim(target)
## [1] 9565 2
dim(regressors)
## [1] 9565 5
Merge/join delle tabelle regressors e target attraverso la chiave univoca ID:
# Join regressors and target on the unique key `id`
data <- regressors %>% inner_join(target, by = 'id')
head(data)
| id | at | v | ap | rh | pe |
|---|---|---|---|---|---|
| 0 | 8.34 | 40.77 | 1010.84 | 90.01 | 480.48 |
| 1 | 23.64 | 58.49 | 1011.40 | 74.20 | NA |
| 2 | 29.74 | 56.90 | 1007.15 | 41.91 | 438.76 |
| 3 | 19.07 | 49.69 | 1007.22 | 76.79 | 453.09 |
| 4 | 11.80 | 40.66 | 1017.13 | 97.20 | 464.43 |
| 5 | 13.97 | 39.16 | 1016.05 | 84.60 | 470.96 |
Analisi delle variabili covariate e target, distribuzione e valori mancanti (missing value):
# Distribution summary of every column except the id key
summary(data[,-1])
## at v ap rh
## Min. : 1.81 Min. :25.36 Min. : 992.9 Min. : 25.56
## 1st Qu.:13.51 1st Qu.:41.74 1st Qu.: 1009.1 1st Qu.: 63.34
## Median :20.34 Median :52.08 Median : 1013.0 Median : 74.98
## Mean :19.65 Mean :54.30 Mean : 1023.7 Mean : 73.31
## 3rd Qu.:25.72 3rd Qu.:66.54 3rd Qu.: 1017.3 3rd Qu.: 84.84
## Max. :37.11 Max. :81.56 Max. :100499.0 Max. :100.16
## NA's :1
## pe
## Min. : 420.3
## 1st Qu.: 439.8
## Median : 451.6
## Mean : 501.1
## 3rd Qu.: 468.4
## Max. :447150.0
## NA's :1
Le variabili at e pe contengono 1 valore mancante ciascuna. Le variabili ap e pe hanno un valore massimo decisamente superiore alla media e al 3° quartile: analizziamone la distribuzione per determinare la possibile presenza di outlier.
# Scatter plot and histogram side by side for the two suspect variables.
# plot()/hist() are called for their side effect only, so their return
# values are not stored (the previous p1/p2 assignments were dead code).
par(mfrow=c(1,2))
plot(data$ap, main = 'Plot ap')
hist(data$ap, main = 'Histogram ap')
plot(data$pe, main = 'Plot pe')
hist(data$pe, main = 'Histogram pe')
# Restore the default single-panel layout
par(mfrow=c(1,1))
Dai grafici deduco che il regressore ap e la variabile dipendente pe presentano ciascuno un valore anomalo (outlier): decido di escludere questi valori dal dataset che utilizzerò per il modello.
Rimozione valori anomali e missing:
# Drop the rows carrying a missing value (in at or pe) or the single
# extreme maximum of ap and of pe, then drop the id column.
rows_to_drop <- c(
  which(is.na(data$at)),
  which(is.na(data$pe)),
  which.max(data$ap),
  which.max(data$pe)
)
data_clean <- data[-rows_to_drop, -1]
summary(data_clean)
## at v ap rh
## Min. : 1.81 Min. :25.36 Min. : 992.9 Min. : 25.56
## 1st Qu.:13.51 1st Qu.:41.74 1st Qu.:1009.1 1st Qu.: 63.34
## Median :20.34 Median :52.08 Median :1013.0 Median : 74.98
## Mean :19.65 Mean :54.30 Mean :1013.3 Mean : 73.32
## 3rd Qu.:25.72 3rd Qu.:66.54 3rd Qu.:1017.3 3rd Qu.: 84.84
## Max. :37.11 Max. :81.56 Max. :1033.3 Max. :100.16
## pe
## Min. :420.3
## 1st Qu.:439.8
## Median :451.6
## Mean :454.4
## 3rd Qu.:468.4
## Max. :495.8
Matrice di correlazione di Pearson
# Pearson correlation matrix between the regressors and the target pe
cor(data_clean)
## at v ap rh pe
## at 1.0000000 0.8442329 -0.50740590 -0.54252332 -0.9481211
## v 0.8442329 1.0000000 -0.41378740 -0.31236277 -0.8698600
## ap -0.5074059 -0.4137874 1.00000000 0.09913437 0.5183374
## rh -0.5425233 -0.3123628 0.09913437 1.00000000 0.3896975
## pe -0.9481211 -0.8698600 0.51833742 0.38969749 1.0000000
L’indice di correlazione suggerisce che sussiste un grado di dipendenza, talvolta forte, dei regressori con la variabile dipendente pe. La variabile più correlata con il target è il regressore at, tramite una forte dipendenza negativa.
Scatter plot dei regressori vs variabile dipendente:
# Interactive scatter plot of each regressor against the target pe
# (top-level calls so each plot auto-prints)
plot_ly(data_clean, x=~at, y=~pe, mode = 'markers', type = 'scatter')
plot_ly(data_clean, x=~v, y=~pe, mode = 'markers', type = 'scatter')
plot_ly(data_clean, x=~ap, y=~pe, mode = 'markers', type = 'scatter')
plot_ly(data_clean, x=~rh, y=~pe, mode = 'markers', type = 'scatter')
Dato che le covariate sono 4 e le osservazioni 9558, i gradi di libertà dei modelli saranno elevati. Di conseguenza non risulterà necessario effettuare un processo di variable selection.
Dal momento che la variabile dipendente pe è quantitativa continua, il problema di fitting è riconducibile alla classe dei modelli supervisionati applicabili a contesti di regressione.
Prima di testare i modelli è necessario: * definire la metrica da minimizzare e utilizzare per confrontare il fit dei modelli: RMSE * dotarsi di un metodo di train/test che consenta di evitare il problema dell’overfitting e verificare che il modello si comporti al meglio in contesti predittivi oltre che esplicativi.
Scelgo di applicare una k-cross-validation (non ripetuta) con 5 folds: il dataset sarà diviso in 5 gruppi ed il modello sarà stimato (train) su 4 gruppi (80%) e poi applicato (test) su di 1 gruppo (20%). Con questo metodo tutte le osservazioni sono utilizzate sia per stimare il modello che per controllare il fit.
# 5-fold (non-repeated) cross-validation, shared by all caret models below
fitControl <- trainControl(method = "cv", number = 5)
Come primo step vediamo come si comporta la regressione lineare multipla: questo modello sarà usato come benchmark per valutare le performance di modelli di ML più sofisticati:
# Benchmark model: multiple linear regression on all four regressors
set.seed(8848)
lm1 <- train(
  pe ~ .,
  data = data_clean,
  method = "lm",
  trControl = fitControl
)
lm1
## Linear Regression
##
## 9558 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647
## Resampling results:
##
## RMSE Rsquared MAE
## 4.5614 0.9287105 3.630108
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Generalized Additive Model
# Could be slow
# Generalized additive model with smooth terms on each regressor
set.seed(8848)
gam1 <- train(
  pe ~ .,
  data = data_clean,
  method = "gam",
  trControl = fitControl
)
gam1
## Generalized Additive Model using Splines
##
## 9558 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647
## Resampling results across tuning parameters:
##
## select RMSE Rsquared MAE
## FALSE 4.161785 0.9405765 3.226169
## TRUE 4.161912 0.9405721 3.226565
##
## Tuning parameter 'method' was held constant at a value of GCV.Cp
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were select = FALSE and method
## = GCV.Cp.
Boosted Generalized Additive Model
set.seed(8848)
# Could be slow
# Boosted generalized additive model; caret tunes mstop over its default grid
gamboost1 <- train(
  pe ~ .,
  data = data_clean,
  method = "gamboost",
  trControl = fitControl
)
gamboost1
## Boosted Generalized Additive Model
##
## 9558 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647
## Resampling results across tuning parameters:
##
## mstop RMSE Rsquared MAE
## 50 4.407041 0.9344974 3.480719
## 100 4.268768 0.9376009 3.358154
## 150 4.239975 0.9383568 3.327934
##
## Tuning parameter 'prune' was held constant at a value of no
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mstop = 150 and prune = no.
Lasso regression
# Lasso regression; caret tunes the shrinkage fraction over its default grid
set.seed(8848)
lasso1 <- train(
  pe ~ .,
  data = data_clean,
  method = "lasso",
  trControl = fitControl
)
lasso1
## The lasso
##
## 9558 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 15.149830 0.8990594 13.144539
## 0.5 8.067583 0.9079672 6.732064
## 0.9 4.661088 0.9264860 3.717352
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
Ridge regression
# Ridge regression; caret tunes lambda over its default grid
set.seed(8848)
ridge1 <- train(
  pe ~ .,
  data = data_clean,
  method = "ridge",
  trControl = fitControl
)
ridge1
## Ridge Regression
##
## 9558 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 4.561400 0.9287105 3.630108
## 1e-04 4.561401 0.9287104 3.630130
## 1e-01 4.873008 0.9200824 3.838092
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.
Polynomial Regression Faccio variare i gradi del polinomio
# Could be slow
# Grid-search the polynomial degree (1..3) of each regressor with a
# CV linear model, recording the CV RMSE of every combination.
# The result frame is pre-allocated via expand.grid instead of being
# grown one row at a time inside a quadruple nested loop.
degree_grid <- expand.grid(i = 1:3, j = 1:3, k = 1:3, h = 1:3)
model_poly <- cbind(degree_grid, RMSE = NA_real_)
for (r in seq_len(nrow(degree_grid))) {
  # Build the formula text for this degree combination
  form <- as.formula(sprintf(
    "pe~poly(at,%d)+poly(v,%d)+poly(ap,%d)+poly(rh,%d)",
    degree_grid$i[r], degree_grid$j[r], degree_grid$k[r], degree_grid$h[r]
  ))
  # Same seed per fit so every combination sees identical CV folds
  set.seed(8848)
  fit <- train(form, data = data_clean, trControl = fitControl, method = 'lm')
  model_poly[r, "RMSE"] <- fit$results$RMSE
}
# Best-performing degree combination
model_poly[which.min(model_poly$RMSE), ]
| i | j | k | h | RMSE | |
|---|---|---|---|---|---|
| 81 | 3 | 3 | 3 | 3 | 4.245404 |
Linear Model with Interactions
# Linear model with all interaction terms among the four regressors
set.seed(8848)
int1 <- train(
  pe ~ at * v * ap * rh,
  data = data_clean,
  method = "lm",
  trControl = fitControl
)
int1
## Linear Regression
##
## 9558 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 7646, 7646, 7647, 7646, 7647
## Resampling results:
##
## RMSE Rsquared MAE
## 4.272396 0.9374021 3.337471
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Random Forest Con CV il modello non è ottimizzato: utilizzo un approccio train-test 80-20 classico.
# Could be slow
# Random forest evaluated with a classic 80/20 train-test split.
# BUG FIX: set.seed() must come *before* sample(), otherwise the split
# itself is not reproducible (previously only the forest was seeded).
set.seed(8848)
train_idx <- sample(seq_len(nrow(data_clean)),
                    as.integer(nrow(data_clean) * 0.8))
data_train_rf <- data_clean[train_idx, ]
data_test_rf <- data_clean[-train_idx, ]
# Re-seed so the forest fit itself is also reproducible
set.seed(8848)
rf1 <- randomForest::randomForest(pe ~ ., data = data_train_rf,
                                  ntree = 200, mtry = 3, importance = TRUE)
# Hold-out RMSE, comparable with the CV RMSE of the caret models
data_test_rf$predict <- predict(rf1, data_test_rf)
data_test_rf$residuals <- data_test_rf$predict - data_test_rf$pe
rf_RMSE <- sqrt(mean((data_test_rf$residuals)^2))
Scelgo il modello che minimizza RMSE:
# Collect each model's best (minimum) RMSE in a proper numeric column.
# Previously the numbers were pushed through a character matrix built
# with rbind(cbind(...)) and repaired with gsub()/as.numeric().
model_results <- data.frame(
  RMSE = c(
    min(lm1$results$RMSE),
    min(gamboost1$results$RMSE),
    min(gam1$results$RMSE),
    min(lasso1$results$RMSE),
    min(ridge1$results$RMSE),
    min(model_poly$RMSE),
    min(int1$results$RMSE),
    rf_RMSE
  ),
  model = c("lm1", "gamboost1", "gam1", "lasso1", "ridge1",
            "model_poly", "int1", "rf1"),
  type = c("linear model", "gamboost", "gam", "lasso", "ridge",
           "Polynomial lm", "Lm with interactions", "random forest"),
  stringsAsFactors = FALSE
)
# Winner = smallest RMSE
pos_best <- which.min(model_results$RMSE)
model_results[pos_best,]
| RMSE | model | type | |
|---|---|---|---|
| 15 | 3.61148263174888 | rf1 | random forest |
Summary del modello finale:
# Print the winning model object by looking up its name in the workspace
get(as.character(model_results[pos_best,"model"]))
##
## Call:
## randomForest(formula = pe ~ ., data = data_train_rf, ntree = 200, mtry = 3, importance = TRUE)
## Type of random forest: regression
## Number of trees: 200
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 10.2698
## % Var explained: 96.46
Importanza delle variabili:
# Variable importance of the winning model (here: the random forest)
get(as.character(model_results[pos_best,"model"]))$importance
## %IncMSE IncNodePurity
## at 374.718235 1633981.74
## v 50.353594 496026.18
## ap 10.087611 42122.85
## rh 9.093884 37690.78
Ho scelto il modello che minimizza RMSE. Avendo utilizzato un approccio di model selection robusto, non incorro in problemi di overfitting. L’utilizzo di un modello di ML migliora le performance predittive di circa il 21% (RMSE da 4.56 a 3.61) rispetto al benchmark (regressione lineare multipla), che già godeva di un buon fit (R-squared = 0.9287105).